library(splines)
library(stringr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(DBI)
nhanes_db <- dbConnect(RSQLite::SQLite(), "nhanes.sqlite")
# list all of the tables
dbListTables(nhanes_db)
## [1] "blood_cholesterol" "body_measures" "current_health_status"
## [4] "demo" "diabetes" "diet_total"
## [7] "medical_conditions" "merged_table" "var_decr"
cols <- 'BMXWAIST , RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR, years, DR1TM161, WTDRD1, BMXLEG, BMXARML '
data_sql <- paste0('SELECT ', cols, 'FROM merged_table')
dbListTables(nhanes_db)
## [1] "blood_cholesterol" "body_measures" "current_health_status"
## [4] "demo" "diabetes" "diet_total"
## [7] "medical_conditions" "merged_table" "var_decr"
data <- dbGetQuery(nhanes_db, data_sql)
data <- na.omit(data)
dbDisconnect(nhanes_db)
train_ix <- sample(x = 1:nrow(data), size = 6000)
test_ix <- sample(x = setdiff(1:nrow(data), train_ix), 3000)
train_data <- data[train_ix, ]
test_data <- data[test_ix, ]
invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}
mean_square_error <- function(y_true, y_pred){
round(mean((y_true - y_pred)^2),4)
}
plot_density <- function(data,data_name,col='red'){
d <- density(data)
plot(d,main=paste(data_name,"Density"))
polygon(d, col=col, border="blue")
}
hist(data$RIDAGEYR)
ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) +
geom_point(alpha=0.1)+
geom_smooth(method = lm, formula = y ~ x)
ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) +
geom_point(alpha=0.1)+
geom_smooth(method = lm, formula = y ~ bs(x, df=7), size = 1)
ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) +
geom_point(alpha=0.1)+
geom_smooth(method = lm, formula = y ~ bs(x, df=7), size = 1)+
geom_smooth(method = lm, formula = y ~ x)
ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST) ) +
geom_point(aes(colour=RIAGENDR),alpha=0.2)+
geom_density_2d()
ggplot(data, aes(x=BMXWT, y=BMXWAIST,colour=RIAGENDR) ) +
geom_point(alpha=0.1)+geom_smooth(method = lm,formula = y ~ x)
ggplot(data, aes(x=BMXWT, y=BMXWAIST) ) +
geom_point(aes(colour=RIAGENDR),alpha=0.1)+
geom_density_2d()
ggplot(data, aes(x=BMXBMI, y=BMXWAIST) ) +
geom_point(aes(colour=RIAGENDR),alpha=0.1)+
geom_density_2d()
Regression models
lm_reg = lm(formula = BMXWAIST ~ BMXWT, train_data)
print(summary(lm_reg))
##
## Call:
## lm(formula = BMXWAIST ~ BMXWT, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.4764 -5.0999 -0.1621 4.9584 29.3024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.704013 0.382114 114.4 <2e-16 ***
## BMXWT 0.684642 0.004572 149.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.257 on 5998 degrees of freedom
## Multiple R-squared: 0.789, Adjusted R-squared: 0.789
## F-statistic: 2.243e+04 on 1 and 5998 DF, p-value: < 2.2e-16
train_res_df <- data.frame(
residuals <- lm_reg$residuals,
age <- train_data$RIDAGEYR,
sex <- train_data$RIAGENDR
)
# tags <- c("[21-30)","[30-40)", "[40-50)", "[50-60)", "[60-70)", "[70-80)","[80,120)")
# train_res_df <- train_res_df |>
# mutate(tag = case_when(
# age < 30 ~ tags[1],
# age >= 30 & age < 40 ~ tags[2],
# age >= 40 & age < 50 ~ tags[3],
# age >= 50 & age < 60 ~ tags[4],
# age >= 60 & age < 70 ~ tags[5],
# age >= 70 & age < 80 ~ tags[6],
# age >= 80 ~ tags[7],
#
# )) |> arrange(age)
#create bins for age
tags <- c("[21-25)","[25-30)", "[30-35)", "[35-40)","[40-45)", "[45-50)", "[50-55)",
"[55,60)","[60-65)", "[65-70)", "[70-75)", "[75-80)", "[80-85)","[85,120)")
train_res_df <- train_res_df |>
mutate(tag = case_when(
age < 25 ~ tags[1],
age >= 25 & age < 30 ~ tags[2],
age >= 30 & age < 35 ~ tags[3],
age >= 35 & age < 40 ~ tags[4],
age >= 40 & age < 45 ~ tags[5],
age >= 45 & age < 50 ~ tags[6],
age >= 50 & age < 55 ~ tags[7],
age >= 55 & age < 60 ~ tags[8],
age >= 60 & age < 65 ~ tags[9],
age >= 65 & age < 70 ~ tags[10],
age >= 70 & age < 75 ~ tags[11],
age >= 75 & age < 80 ~ tags[12],
age >= 80 & age < 85 ~ tags[13],
age >= 85 ~ tags[14],
)) |> arrange(age)
train_res_df$tag <- as.factor(train_res_df$tag)
ggplot(data = train_res_df, mapping = aes(x=tag,y=residuals,color=sex)) +
geom_jitter(alpha=0.2)+
geom_boxplot(fill="bisque",color="black",alpha=0.3)+
stat_summary(fun = mean, color="blue", shape=20,size=1)+
theme(axis.text.x = element_text(angle = 45))
## Warning: Removed 14 rows containing missing values (geom_segment).
test_df <- test_data |> select(-BMXWAIST)
lm_pred = predict(lm_reg, newdata = test_df, se = T)
# save prediction results
pred_df = data.frame(
fit = lm_pred$fit,
lwr = lm_pred$fit - 2 * lm_pred$se.fit,
upr = lm_pred$fit + 2 * lm_pred$se.fit,
weight = test_data$BMXWT,
sex = test_data$RIAGENDR,
age <- train_data$RIDAGEYR,
label = test_data$BMXWAIST
)
## Warning in data.frame(fit = lm_pred$fit, lwr = lm_pred$fit - 2 *
## lm_pred$se.fit, : row names were found from a short variable and have been
## discarded
ggplot(pred_df, aes(x = weight, y = label,color=sex) ) +
geom_point(alpha=0.2) +
geom_ribbon( aes(ymin = lwr, ymax = upr, fill = sex), alpha = .15) +
geom_line(aes(y = fit), size = 1)
Regression models with age
lm_reg = lm(formula = BMXWAIST ~ BMXWT + RIDAGEYR, train_data)
lm_pred = predict(lm_reg, newdata = test_df, se = T)
lm_reg2 = lm(formula = BMXWAIST ~ BMXWT + bs(RIDAGEYR,df=7), train_data)
lm_pred2 = predict(lm_reg2, newdata = test_df, se = T)
# save prediction results
pred_df = data.frame(
fit1 = lm_pred$fit,
fit2 = lm_pred2$fit,
weight = test_data$BMXWT,
sex = test_data$RIAGENDR,
age <- train_data$RIDAGEYR,
label = test_data$BMXWAIST
)
## Warning in data.frame(fit1 = lm_pred$fit, fit2 = lm_pred2$fit, weight =
## test_data$BMXWT, : row names were found from a short variable and have been
## discarded
ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
geom_point(aes(y=fit1,x= weight,color=sex),alpha = 0.3)
ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
geom_point(data=pred_df,aes(y=fit1,x= weight,colour="lm"),alpha = 0.1)+
geom_point(data=pred_df,aes(y=fit2,x= weight,colour="lm_bs"),alpha = 0.1) +
scale_color_manual(name = "models", values = c("lm" = "darkblue", "lm_bs" = "red"))
qplot(x=fit1,y=fit2,data=pred_df,colour=sex,alpha=I(0.1))